#install.packages("rmarkdown")
library(rmarkdown)

Load the necessary R libraries

#if (!requireNamespace("BiocManager", quietly = TRUE))
    #install.packages("BiocManager")

#BiocManager::install("treeio")
#devtools::install_github("GuangchuangYu/ggtree")
library(treeio)
## treeio v1.19.1  For help: https://yulab-smu.top/treedata-book/
## 
## If you use treeio in published research, please cite:
## 
## LG Wang, TTY Lam, S Xu, Z Dai, L Zhou, T Feng, P Guo, CW Dunn, BR Jones, T Bradley, H Zhu, Y Guan, Y Jiang, G Yu. treeio: an R package for phylogenetic tree input and output with richly annotated and associated data. Molecular Biology and Evolution 2020, 37(2):599-603. doi: 10.1093/molbev/msz240
library(ggtree)
## ggtree v3.3.1  For help: https://yulab-smu.top/treedata-book/
## 
## If you use ggtree in published research, please cite the most appropriate paper(s):
## 
## 1. Guangchuang Yu. Using ggtree to visualize data on tree-like structures. Current Protocols in Bioinformatics. 2020, 69:e96. doi:10.1002/cpbi.96
## 2. Guangchuang Yu, Tommy Tsan-Yuk Lam, Huachen Zhu, Yi Guan. Two methods for mapping and visualizing associated data on phylogeny using ggtree. Molecular Biology and Evolution. 2018, 35(12):3041-3043. doi:10.1093/molbev/msy194
## 3. Guangchuang Yu, David Smith, Huachen Zhu, Yi Guan, Tommy Tsan-Yuk Lam. ggtree: an R package for visualization and annotation of phylogenetic trees with their covariates and other associated data. Methods in Ecology and Evolution. 2017, 8(1):28-36. doi:10.1111/2041-210X.12628
library(phytools)
## Loading required package: ape
## 
## Attaching package: 'ape'
## The following object is masked from 'package:ggtree':
## 
##     rotate
## The following object is masked from 'package:treeio':
## 
##     drop.tip
## Loading required package: maps
## 
## Attaching package: 'phytools'
## The following object is masked from 'package:treeio':
## 
##     read.newick
library(viridisLite)
library(ggplot2)
library(googlesheets4)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Load data from google spreadsheet (Already stored in an R object that you can just load)

#id<-"1e7Hj_BQ8rke5XW7j93YL6QVAGbcvsb2yw7oD86WnPJY"
#gsheet<-read_sheet(id,sheet="GlobalOverview",skip = 2,na = c(""," ","NA","#DIV/0!"),col_types = "c",trim_ws=T)
#GlobalOverview<-as.data.frame(gsheet,stringsAsFactors=F)
#save(GlobalOverview,file="../../misc/data/gsheet.GlobalOverview.21-09-22.Robj")
load("/Users/Janina/Documents/MASTER/Masterarbeit/lgt/0_data/gsheet.GlobalOverview.21-09-22.Robj")

Create a dataframe where the first row is the GAGA id (e.g. GAGA-0002), which are also used as the tip labels in the phylogeny

data2add<-data.frame(GAGA.id = GlobalOverview$`GAGA ID`, GlobalOverview)

Gather subfamily info for NCBI IDs as well and manually add the subfamily

colnames(data2add)
##  [1] "GAGA.id"                                  
##  [2] "RNAseqID"                                 
##  [3] "RNAseqComment"                            
##  [4] "Worker"                                   
##  [5] "Large.worker"                             
##  [6] "Small.worker"                             
##  [7] "Soldier"                                  
##  [8] "Gyne"                                     
##  [9] "Queen"                                    
## [10] "Male"                                     
## [11] "Larvae"                                   
## [12] "Pupae"                                    
## [13] "Brood"                                    
## [14] "Mixed"                                    
## [15] "ENTRY.FROM.Species.ID.polished"           
## [16] "original.species.ID"                      
## [17] "final.species.ID"                         
## [18] "ENTRY.FROM.SampleType"                    
## [19] "PacBIo"                                   
## [20] "stLFR...19"                               
## [21] "HiC"                                      
## [22] "Collector.Firstname"                      
## [23] "Collector.Name"                           
## [24] "Sample.ID"                                
## [25] "Species"                                  
## [26] "PacBio.library"                           
## [27] "stLFR.library"                            
## [28] "Short.reads..WGS..library"                
## [29] "RNAseq.library"                           
## [30] "PacBio...29"                              
## [31] "stLFR...30"                               
## [32] "Short.reads..WGS....31"                   
## [33] "Hi.C...32"                                
## [34] "RNAseq"                                   
## [35] "Subfamily"                                
## [36] "Country"                                  
## [37] "Comment"                                  
## [38] "ENTRY.FROM.GAGA.SUBMISSION"               
## [39] "GAGA.ID"                                  
## [40] "ENTRY.FROM.ASSEMBLY.OVERVIEW"             
## [41] "Species..from.Assembly.Overview."         
## [42] "PacBio...41"                              
## [43] "stLFR...42"                               
## [44] "Short.reads..WGS....43"                   
## [45] "Hi.C...44"                                
## [46] "RNA..additional.samples.still.under.seq.."
## [47] "PacBio.assembly"                          
## [48] "PacBio.stLFR.WGS.polished.assembly"       
## [49] "PacBio.stLFR.scaffolded"                  
## [50] "PacBio.Hi.C.scaffolded"                   
## [51] "stLFR.assembly"                           
## [52] "Current.state"                            
## [53] "Genome.size...52"                         
## [54] "Number.of.scaffolds...53"                 
## [55] "Scaffold.N50...54"                        
## [56] "Contig.N50...55"                          
## [57] "Longest.scaffold...56"                    
## [58] "BUSCO.Eukaryota...57"                     
## [59] "BUSCO.Hymenoptera...58"                   
## [60] "Duplicated.scaffolds"                     
## [61] "Total.length.of.duplicated.scaffolds"     
## [62] "Percentage.of.duplication"                
## [63] "Comments"                                 
## [64] "Bacterial.scaffolds.count"                
## [65] "Total.length.of.bacterial.scaffolds"      
## [66] "Bacterial.scaffold.prevalence.."          
## [67] "Top.bacterial.taxon"                      
## [68] "Putative.misassembly.count"               
## [69] "Putative.misassemblies...100Kb..count"    
## [70] "Human.scaffold.count"                     
## [71] "Genome.size...70"                         
## [72] "Number.of.scaffolds...71"                 
## [73] "Scaffold.N50...72"                        
## [74] "Contig.N50...73"                          
## [75] "Longest.scaffold...74"                    
## [76] "L50"                                      
## [77] "L90"                                      
## [78] "BUSCO.Eukaryota...77"                     
## [79] "BUSCO.Hymenoptera...78"                   
## [80] "Short.read.mapping.rate"                  
## [81] "Completeness"                             
## [82] "QV"                                       
## [83] "RNA.seq.mapping.rate"                     
## [84] "Repeat.annotation"                        
## [85] "Gene.annotation"
subset(data2add,is.na(Subfamily))
data2add$Subfamily[data2add$GAGA.id=="NCBI-0001"]<-"Dorylinae"
data2add$Subfamily[data2add$GAGA.id=="NCBI-0002"]<-"Myrmicinae"
data2add$Subfamily[data2add$GAGA.id=="NCBI-0003"]<-"Myrmicinae"
data2add$Subfamily[data2add$GAGA.id=="NCBI-0004"]<-"Myrmicinae"
data2add$Subfamily[data2add$GAGA.id=="NCBI-0005"]<-"Formicinae"
data2add$Subfamily[data2add$GAGA.id=="NCBI-0006"]<-"Myrmicinae"
data2add$Subfamily[data2add$GAGA.id=="NCBI-0007"]<-"Ponerinae"
data2add$Subfamily[data2add$GAGA.id=="NCBI-0008"]<-"Formicinae"
data2add$Subfamily[data2add$GAGA.id=="NCBI-0009"]<-"Ponerinae"
data2add$Subfamily[data2add$GAGA.id=="NCBI-0010"]<-"Dolichoderinae"
data2add$Subfamily[data2add$GAGA.id=="NCBI-0011"]<-"Ponerinae"
data2add$Subfamily[data2add$GAGA.id=="NCBI-0012"]<-"Myrmicinae"
data2add$Subfamily[data2add$GAGA.id=="NCBI-0013"]<-"Myrmicinae"
data2add$Subfamily[data2add$GAGA.id=="NCBI-0014"]<-"Myrmicinae"
data2add$Subfamily[data2add$GAGA.id=="NCBI-0015"]<-"Myrmicinae"
data2add$Subfamily[data2add$GAGA.id=="NCBI-0016"]<-"Myrmicinae"
data2add$Subfamily[data2add$GAGA.id=="NCBI-0017"]<-"Myrmicinae"
data2add$Subfamily[data2add$GAGA.id=="OUT-0001"]<-"Formicinae"
data2add$Subfamily[data2add$GAGA.id=="OUT-0002"]<-"Formicinae"

Create a column with clean species IDs for all species.

This has to be done, because the database of all the sequenced species is a bit convoluted. The function coalesce() finds the first non-missing value at each position. In this case, the columns “Species” and “Species..from.Assembly.Overview” are collapsed. If the column “Species” does not have an entry, the entry from the column “Species..from.Assembly.Overview” will be taken.

data2add$cleanSpeciesID<-dplyr::coalesce(data2add$Species,data2add$Species..from.Assembly.Overview.)

Load the tree file of all the GAGA genomes.

tree <- read.iqtree("/Users/Janina/Documents/MASTER/Masterarbeit/lgt/0_data/hymenoptera_psc98_partition_b1000_40threads_msubnucl.treefile")

Create a very quick and dirty plot of the phylogeny

plot(tree@phylo,show.node.label = T,show.tip.label = T,cex=.5)
nodelabels(bg=rgb(0,0,0,0),col="red",frame = "none")

Check the plot to see where the root should be. Here we root the tree at node 166 that separates the outgroups, the honeybee “Amel” (Apis mellifera) and the parasitoid wasp Nasonia (Nvit), from ants.

#tree.rooted<-treeio::root(tree,node=166)
tree.rooted <- tree
tree.rooted.subset<-treeio::drop.tip(tree.rooted,tip=c("NCBI-0018_Amel","NCBI-0019_Nvit"))

Plot basic tree

tp<-ggtree(tree.rooted.subset,ladderize = T, layout = "circular")+geom_rootedge(.01)

Add the species identity from FAS.genes.and.names.tsv

tp$data$label[tp$data$isTip==T]<-gsub("_.*","",tp$data$label[tp$data$isTip==T])

tp2<-tp %<+% data2add
tp2

1 A) Make complex tree with the subfamilies highlighted by color

#install.packages("ggnewscale")
library("ggnewscale")

#Make Subfamily as a factor (discrete color scale for factors)
tp2$data$Subfamily<-as.factor(tp2$data$Subfamily)

tp3<-tp2 +
  geom_tiplab2(aes(label=paste(label,cleanSpeciesID,sep=" "), color = Subfamily),size=2,align = T,linesize = .1)+    #add the tip labels
  geom_nodepoint(aes(fill=UFboot),stroke=0.5,pch=21,size=2)+       #add the info about bootstrapping success as points of different colors to the nodes
 scale_fill_viridis_c()+  #change the color of the UFboot node points to a viridis color scale
  theme(legend.position = "left",
        legend.title=element_text(size=11), # the title size of legend.
        legend.text=element_text(size=10)
        )
tp3 #plot the tree

#ggsave("/Users/Janina/Documents/MASTER/Masterarbeit/plots/GAGA_phylogenies/Subfamilies.pdf",
#width = 15,height=30) #save the tree in a pdf

2 A.1) LGT data: Include the LGT data in the circular phylogeny

#Add the raw LGT dataframe
library(readxl)
LGTs.raw<-read_excel("/Users/Janina/Documents/MASTER/Masterarbeit/lgt/0_data/GAGA.LGT.162.resulting.good.single.candidates.filtered.xlsx")
LGTs.raw$comment <- NULL

#Clean up the LGT dataframe to remove the database origin (GAGA-0001.euk -> GAGA-0001)
LGTs<-LGTs.raw
LGTs$.id<-gsub("\\..*","",LGTs$.id)

3 A.1.1) Add the information from the other excel dataframes containing the dfast results and expression and combine everything into one df

LGTs.dfast<-read_excel("/Users/Janina/Documents/MASTER/Masterarbeit/lgt/0_data/LGTs.dfast.incorporated.xlsx")
LGTs.expression<-read_excel("/Users/Janina/Documents/MASTER/Masterarbeit/lgt/0_data/GAGA.LGT.completeness.expression.xlsx")

#Join the LGTs df and the LGTs.dfast df together. With full_join, both dataframes will be merged and identical columns will be kept, as well as new ones added.
LGTs.dfast$start_overlap<-as.numeric(LGTs.dfast$start_overlap)
## Warning: NAs introduced by coercion
LGTs.dfast$end_overlap<-as.numeric(LGTs.dfast$end_overlap)
## Warning: NAs introduced by coercion
LGTs.dfast$complete_overlap<-as.numeric(LGTs.dfast$complete_overlap)
## Warning: NAs introduced by coercion
LGTs.dfast$nr_predicted_loci<-as.numeric(LGTs.dfast$nr_predicted_loci)

#LGTs.combined <- full_join(LGTs,LGTs.dfast)

#Joining, by = c("cand.locus", "locus", ".id", "cand.start", "cand.end", "cand.length", "bestProHit", #"BitDiffSum", "cand.scaffold", "locus.start", "locus.end", "start_overlap", "end_overlap", #"complete_overlap", "species", "subfamily", "nr_predicted_loci", "prokaryote_origin", #"order_prokaryote_origin", "class_prokaryote_origin", "predicted_protein", "assembly_method", #"Wolbachia_infection", "CPH_avail")

#Clean the LGT.combined df
#LGTs.combined$LGT_quality<-NULL
#write.table(LGTs.combined, file='/Users/Janina/Documents/MASTER/Masterarbeit/lgt/0_data/LGTs.combined.dfast.tsv', quote=FALSE, sep='\t', col.names = NA)
#write.table(LGTs.expression, file='/Users/Janina/Documents/MASTER/Masterarbeit/lgt/0_data/LGTs.expression.tsv', quote=FALSE, sep='\t', col.names = NA)

4 A.2) LGT data: Add the LGT candidate numbers per genome to the phylogeny

Plot the circular phylogeny and add red dot to the tip, where the size of the red dot represents the number of identified LGTs.

#Create a tally dataframe that counts how many LGT candidates we have per species.
LGT.tally<-as.data.frame(table(select(LGTs,.id)))
colnames(LGT.tally)<-c("GAGAid","LGT.count")

#Add the LGT.tally dataframe to the phylogeny object as additional data.
tp4<-tp3 %<+% LGT.tally
#see tp4$data$LGT.count

#Plot the phylogeny and add red dot to the tip, where the size of the red dot represents the number of identified LGTs
tp5<-tp4+geom_tippoint(aes(size=LGT.count),col="red", alpha=.5)
tp5
## Warning: Removed 77 rows containing missing values (geom_point_g_gtree).

#ggsave("/Users/Janina/Documents/MASTER/Masterarbeit/plots/GAGA_phylogenies/GAGA.phylogeny.LGTs.pdf",width = 12,height=24) #save the tree in a pdf

#write.table(LGT.tally, file="GAGAspecies_with_lgts.tsv", sep=",", col.names = NA)

5 A.3) LGT data: Add the different LGT categories to the phylogeny as a heatmap

Create a dataframe that counts the genomes with ankyrin repeats

#Only select the ankyrins to show them as a heatmap
LGT.ankyrins<-subset(LGTs,predicted_protein=="ankyrin repeat protein") %>% group_by(.id) %>% tally()
colnames(LGT.ankyrins)<-c(".id","protein")

LGT.ankyrins$type<-"ankyrin repeat protein"
knitr::kable(head(LGT.ankyrins))
.id protein type
GAGA-0020 10 ankyrin repeat protein
GAGA-0024 7 ankyrin repeat protein
GAGA-0028 29 ankyrin repeat protein
GAGA-0063 2 ankyrin repeat protein
GAGA-0087 38 ankyrin repeat protein
GAGA-0098 2 ankyrin repeat protein
#Only select the ankyrins and lysozymes to show them on the phylogeny
LGT.anks.lys<-subset(LGTs,predicted_protein %in% c("ankyrin repeat protein", "Lysozyme")) %>% group_by(.id)

LGT.anks.lys<-LGT.anks.lys[,c(26,3)]
#LGT.anks.lys <- LGT.anks.lys %>% count(.id,predicted_protein, sort = T)
#colnames(LGT.anks.lys)<-c("GAGAid","protein","count")

#knitr::kable(head(LGT.anks.lys))
#BiocManager::install("ggstar")
library(ggtree)
library(ggtreeExtra)
## ggtreeExtra v1.4.1  For help: https://yulab-smu.top/treedata-book/
## 
## If you use ggtreeExtra in published research, please cite the paper:
## 
## S Xu, Z Dai, P Guo, X Fu, S Liu, L Zhou, W Tang, T Feng, M Chen, L Zhan, T Wu, E Hu, Y Jiang, X Bo, G Yu. ggtreeExtra: Compact visualization of richly annotated phylogenetic data. Molecular Biology and Evolution 2021, 38(9):4039-4042. doi: 10.1093/molbev/msab166
library(ggplot2)
library(ggnewscale)
library(dplyr)
library(tidytree)
## 
## Attaching package: 'tidytree'
## The following object is masked from 'package:stats':
## 
##     filter
library(ggstar)

6 A.4) Plot all predicted proteins onto the phylogeny

LGT.proteins<-LGTs[,c(26,3)]
LGT.proteins <- LGT.proteins %>% count(.id,predicted_protein, sort = T)
colnames(LGT.proteins)<-c("GAGAid","protein","count")

LGT.proteins <- na.omit(LGT.proteins)
knitr::kable(head(LGT.proteins))
GAGAid protein count
GAGA-0087 ankyrin repeat protein 38
GAGA-0028 ankyrin repeat protein 29
GAGA-0401 ankyrin repeat protein 15
GAGA-0020 ankyrin repeat protein 10
GAGA-0223 ankyrin repeat protein 9
GAGA-0364 ankyrin repeat protein 9
library("viridis")
## 
## Attaching package: 'viridis'
## The following object is masked from 'package:maps':
## 
##     unemp
# For the bar layer outside of the tree
tp5.5 <- tp5 +
  new_scale_fill() +
  geom_fruit(
          data=LGT.proteins,
          geom=geom_bar,
          mapping=aes(y=GAGAid, x=count,fill=protein), #The count will be mapped to x
          pwidth=1,      # Width of the external layer
          stat="identity",
          orientation="y", #The orientation of the axis
          position=position_stackx(hexpand=1),   #Adjust the position of the geom_layer
          axis.params =list(
                         axis="x",       # add axis text of the layer.
                         vjust=0.2,
                         text.angle=0,   # the text size of axis.
                         text.size=1.8,
                         hjust=1,        # adjust the horizontal position of text of axis
                         nbreak=1,       # number of lines within geom_layer,
                           ),
         grid.params=list(alpha=.5)      # Parameter to adjust gridlines around the tree
      ) +
      scale_fill_discrete(     #color of the external geom_layer
       name="Protein",        #name of the legend for the external geom_layer
       guide=guide_legend(keywidth=1,keyheight=1,order=6, ncol=6)
      ) +
      theme(legend.position="bottom", # the position of legend.
            legend.box="vertical", legend.margin = margin(0,0,0,0), #change the spacing between the legend and the plot
            legend.box.margin=margin(10,10,10,10),
          legend.background=element_rect(fill=NA), # the background of legend.
          legend.title=element_text(size=12), # the title size of legend.
          legend.text=element_text(size=10) # the text size of legend.
          #legend.spacing.y = unit(0.01, "cm")
      )
tp5.5
## Warning: Removed 77 rows containing missing values (geom_point_g_gtree).

#ggsave("/Users/Janina/Documents/MASTER/Masterarbeit/plots/GAGA_phylogenies/GAGA.phylogeny.proteins.pdf",width=13,height=26) #save the tree in a pdf

7 A.5) Subset the LGT count into Wolbachia and Blochmannia origin

LGT.bacteria<-LGTs[,c(22,3)]
LGT.bacteria <- LGT.bacteria %>% count(.id,prokaryote_origin, sort = T)
colnames(LGT.bacteria)<-c("GAGAid","prokaryote_origin","count")

LGT.bacteria <- na.omit(LGT.bacteria)
LGT.bacteria.WB.BM <- subset(LGT.bacteria,prokaryote_origin %in% c("Wolbachia", "Blochmannia")) %>% group_by(GAGAid)
knitr::kable(head(LGT.bacteria.WB.BM))
GAGAid prokaryote_origin count
GAGA-0087 Wolbachia 39
GAGA-0028 Wolbachia 38
GAGA-0401 Wolbachia 23
GAGA-0513 Wolbachia 16
GAGA-0223 Wolbachia 15
GAGA-0510 Wolbachia 13

8 A.6) Add the bacterial origin

tp6 <- tp5.5 +
  new_scale_fill() +
  geom_fruit(
          data=LGT.bacteria,
          geom=geom_bar,
          mapping=aes(y=GAGAid, x=count,fill=prokaryote_origin), #The count will be mapped to x
          pwidth=1,      # Width of the external layer
          stat="identity",
          orientation="y", #The orientation of the axis
          position=position_stackx(hexpand=.7),   #Adjust the position of the geom_layer
          axis.params =list(
                          axis="x",       # add axis text of the layer.
                          vjust=0.2,
                          text.angle=0,   # the text size of axis.
                          text.size=1.5,
                          hjust=1,        # adjust the horizontal position of text of axis
                          nbreak=1,       # number of lines within geom_layer,
                          ),
          grid.params=list(alpha=.5)      # Parameter to adjust gridlines around the tree
      ) +
     scale_fill_manual(
       values=c("dodgerblue4","cornflowerblue","paleturquoise4","grey"),          #color of the external geom_layer
       name="Prokaryote origin",        #name of the legend for the external geom_layer
       breaks=c("Wolbachia","Blochmannia","Spiroplasma", "Other"),
       guide=guide_legend(keywidth=1,keyheight=1,order=6, ncol=4)
      ) +
      theme(legend.position="bottom", # the position of legend.
            legend.box="vertical", legend.margin = margin(0,0,0,0), #change the spacing between the legend and the plot
            legend.box.margin=margin(20,20,20,20), #change the spacing between the legend and the plot
          legend.background=element_rect(fill=NA), # the background of legend.
          legend.title=element_text(size=12), # the title size of legend.
          legend.text=element_text(size=10) # the text size of legend.
         # legend.spacing.y = unit(0.01, "cm")
      )
tp6
## Warning: Removed 77 rows containing missing values (geom_point_g_gtree).

#ggsave("/Users/Janina/Documents/MASTER/Masterarbeit/plots/GAGA_phylogenies/GAGA.phylogeny.proteins.bacteria.changed.colours.pdf",width=20,height=25) #save the tree in a pdf
#graph2ppt(file="/Users/Janina/Documents/MASTER/Masterarbeit/plots/GAGA_phylogenies/GAGA.phylogeny.proteins.bacteria.legend.ppt", width=30, height=20)

9 A.7) Separating the legend from the tree to position it below the tree

tp_bacteria <- tp6
tp_protein <- tp5.5

require(cowplot)
## Loading required package: cowplot
leg1 <- get_legend(tp_bacteria)
## Warning: Removed 77 rows containing missing values (geom_point_g_gtree).
tp7 <- tp6 + theme(legend.position="none")
plot_grid(tp7, leg1, ncol=1, rel_widths=c(0.07, 0.5))
## Warning: Removed 77 rows containing missing values (geom_point_g_gtree).

tp7 <- tp6 +
      theme(legend.position="bottom", # the position of legend.
          legend.background=element_rect(), # the background of legend.
          legend.title=element_text(size=11), # the title size of legend.
          legend.text=element_text(size=9), # the text size of legend.
          legend.spacing.y = unit(0.02, "cm")
      )
tp7
## Warning: Removed 77 rows containing missing values (geom_point_g_gtree).

#ggsave("/Users/Janina/Documents/MASTER/Masterarbeit/plots/GAGA_phylogenies/GAGA.phylogeny.proteins.bacterialorigin.pdf",width=15,height=30) #save the tree in a pdf
#graph2ppt(file="/Users/Janina/Documents/MASTER/Masterarbeit/plots/GAGA_phylogenies/GAGA.phylogeny.proteins.bacterialorigin.ppt", width=12, height=12)

Last updated 2022/01/07